#############################################
## PROGRAM NAME: 202c_msa_regs_med.R       ##
## AUTHOR: MATT MLECZKO                    ##
## INPUTS:                                 ##
##         002_tract_to_pl.Rda             ##
##         002_tract_to_cs.Rda             ##
##         005_msas_keep.Rda               ##
##         005_cc_out.Rda                  ##
##         201_af_msa_fin.Rda              ##
##         201_af_muni_fin.Rda             ##
##                                         ##
## OUTPUTS:                                ##
##          202c_m1_msa_med.Rda -          ##
##          202c_m12_msa_med.Rda           ##
##          202c_m1_msa_noimp_med.Rda -    ##
##          202c_m12_msa_noimp_med.Rda     ##
##                                         ##
## PURPOSE: MSA regressions using          ##
##          ZRI median                     ##
##                                         ##
## LIST OF UPDATES:                        ##
#############################################

#log <- file(# USER DEFINED PATH AND FILE NAME HERE #)
#sink(log, append=TRUE)
#sink(log, append=TRUE, type="message")

set.seed(08542)

## load libraries ##

library(tidyverse)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(gsubfn)
library(haven)
library(readxl)
library(mice)
library(Rcpp)
library(basecamb)

## define paths ##
data_path <- # USER DEFINED PATH HERE #
pr_path <- # USER DEFINED PATH HERE #

## set working directory ##
setwd(data_path)

`%notin%` <- Negate(`%in%`)

## create a merge function that creates merge frequency as in Stata ##
## from user rwbuie at stackoverflow: https://stackoverflow.com/questions/30358401/is-there-a-way-to-create-statas-merge-indicator-variable-with-rs-merge ##
stata.merge <- function(x,y, name){
  x$df1 <- 1
  y$df2 <- 2
  df <- merge(x,y, by = name, all = TRUE)
  df$merge.variable <- rowSums(df[,c("df1", "df2")], na.rm=TRUE)
  df$df1 <- NULL
  df$df2<- NULL
  df
  #print(table(df$merge.variable))
  
  ## return the merged dataframe
  return(df)
}

## load analytic files ##

load("002_tract_to_pl.Rda")
load("002_tract_to_cs.Rda")

load("005_msas_keep.Rda")
load("005_cc_out.Rda")

load("201_af_msa_fin.Rda")
load("201_af_muni_fin.Rda")

af.muni.fin <- rename(af.muni.fin, 
                      rb_muniwide_2009 = rb_2009_muniwide,
                      rb_muniwide_2022 = rb_2022_muniwide)


## how extensive does the imputation need to be? ##

summary(af.muni.fin$zri_2006)
summary(af.muni.fin$sindex1_2006)
summary(af.muni.fin$sindex2_2006)
summary(af.muni.fin$sindex3_2006)
summary(af.muni.fin$sindex4_2006)
summary(af.muni.fin$sindex5_2006)
summary(af.muni.fin$sindex6_2006)
summary(af.muni.fin$sindex7_2006)

sapply(af.muni.fin, function(x) sum(is.na(x)))

## prep data for mi ##
af.muni.rd <- af.muni.fin %>%
  select(GEOID,
         pop_total_2009,
         pop_density_2009,
         per_18t34_2009,
         per_35t64_2009,
         per_65over_2009,
         median_age_2009,
         per_asian_2009,
         per_black_2009,
         per_latino_2009,
         per_white_2009,
         entropy_er_2009,
         per_child_2009,
         per_nomove_2009,
         per_immigrant_2009,
         hownrt_2009,
         median_gross_rent_2009,
         median_prop_value_2009,
         vacant_rt_2009,
         median_yr_str_2009,
         median_hhld_inc_2009,
         hhld_pov_rt_2009,
         unem_rt_2009,
         gini_2009,
         oo_hu_2009,
         per_bm_2009,
         sindex1_2006,
         sindex2_2006,
         sindex3_2006,
         sindex4_2006,
         sindex5_2006,
         sindex6_2006,
         sindex7_2006,
         sindex1_2022,
         sindex2_2022,
         sindex3_2022,
         sindex4_2022,
         sindex5_2022,
         sindex6_2022,
         sindex7_2022,
         zri_2006,
         zri_st_2006,
         zri_2022,
         zri_st_2022)

test <- af.muni.rd %>%
  select(-GEOID)

test.cor <- cor(test)

sapply(af.muni.rd, function(x) sum(is.na(x)))

## multiple imputation ## 
## source: https://library.virginia.edu/data/articles/getting-started-with-multiple-imputation-in-r ##

imp <- mice(af.muni.rd, maxit=0)

## extract predictorMatrix and methods of imputation ##

predM <- imp$predictorMatrix
meth <- imp$method
det(predM)

## leave these out of the predictor matrix ##
predM[, c("GEOID")] <- 0
predM[, c("sindex3_2006")] <- 0
predM[, c("sindex6_2006")] <- 0
predM[, c("sindex7_2006")] <- 0

## check determinant ##
det(predM)

## check the method ##
meth

## run the imputation ##

af.muni.mi1 <- mice(af.muni.rd, m = 10, 
                    predictorMatrix = predM, 
                    method = "cart", print =  FALSE)

## pool all the imputed data sets, along with original ## 

af.muni.complete <- complete(af.muni.mi1, "long", include = T)

summary(af.muni.complete)
sapply(af.muni.complete, function(x) sum(is.na(x)))

## PCA ## 

mi.pca <- function(impu){
  
  mi.pca.data <- af.muni.complete %>%
    filter(.imp == impu) %>%
    select(sindex1_2006, 
           sindex2_2006, 
           sindex3_2006, 
           sindex4_2006, 
           sindex5_2006, 
           sindex6_2006, 
           sindex7_2006)
  
  summary(mi.pca.data)
  
  mi.pca.res <- prcomp(mi.pca.data, center = T, scale=T)
  loadings <- mi.pca.res$rotation
  
  pca.data <- af.muni.complete %>%
    filter(.imp == impu) %>%
    mutate(zri_2006 = sindex1_2006*loadings[1,1] + 
             sindex2_2006*loadings[2,1] + 
             sindex3_2006*loadings[3,1] + 
             sindex4_2006*loadings[4,1] + 
             sindex5_2006*loadings[5,1] + 
             sindex6_2006*loadings[6,1] + 
             sindex7_2006*loadings[7,1])
  
  pca.data$zri_st_2006 <- (pca.data$zri_2006 - mean(pca.data$zri_2006, na.rm=T))/sd(pca.data$zri_2006,na.rm=T)
  
  return(pca.data)
  
}

mi.pca.out <- lapply(c(1,2,3,4,5,6,7,8,9,10), mi.pca)

## extract the pca output ## 

af.muni.out <- bind_rows(mi.pca.out)
summary(af.muni.out)

## get original data ##

af.muni.og <- af.muni.complete %>%
  filter(.imp == 0)

## bring on rest of covariates and outcomes ##

af.muni.m2 <- af.muni.fin %>%
  select(GEOID,
         pop_total_2022,
         pop_density_2022,
         per_18t34_2022,
         per_35t64_2022,
         per_65over_2022,
         median_age_2022,
         per_asian_2022,
         per_black_2022,
         per_latino_2022,
         per_white_2022,
         entropy_er_2022,
         per_child_2022,
         per_nomove_2022,
         per_immigrant_2022,
         hownrt_2022,
         median_gross_rent_2022,
         median_prop_value_2022,
         vacant_rt_2022,
         median_yr_str_2022,
         median_hhld_inc_2022,
         hhld_pov_rt_2022,
         unem_rt_2022,
         gini_2022,
         oo_hu_2022,
         per_bm_2022,
         starts_with("rb_"),
         starts_with("mpv_"),
         starts_with("mgr_"))

## merge ##
af.muni.reg.m <- stata.merge(af.muni.out,
                             af.muni.m2,
                             "GEOID")

## check merge ## 
table(af.muni.reg.m$merge.variable, useNA = "ifany")

## imputed data ##
af.muni.imp <- af.muni.reg.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## now, original data ##

af.muni.m3 <- af.muni.fin %>%
  select(GEOID,
         pop_total_2022,
         pop_density_2022,
         per_18t34_2022,
         per_35t64_2022,
         per_65over_2022,
         median_age_2022,
         per_asian_2022,
         per_black_2022,
         per_latino_2022,
         per_white_2022,
         entropy_er_2022,
         per_child_2022,
         per_nomove_2022,
         per_immigrant_2022,
         hownrt_2022,
         median_gross_rent_2022,
         median_prop_value_2022,
         vacant_rt_2022,
         median_yr_str_2022,
         median_hhld_inc_2022,
         hhld_pov_rt_2022,
         unem_rt_2022,
         gini_2022,
         oo_hu_2022,
         per_bm_2022,
         starts_with("rb_"),
         starts_with("mpv_"),
         starts_with("mgr_"))

## merge ##
af.muni.og.fin.m <- stata.merge(af.muni.og,
                                af.muni.m3,
                                "GEOID")

## check merge ## 
table(af.muni.og.fin.m$merge.variable, useNA = "ifany")

## final reg file ##
af.muni.og.fin <- af.muni.og.fin.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## combine data ##

af.muni.cb <- rbind(af.muni.og.fin,
                    af.muni.imp)

af.muni.cb.fin <- af.muni.cb[order(af.muni.cb$.imp, af.muni.cb$.id),]

## final vars ##

af.muni.final <- af.muni.cb.fin %>%
  mutate(zri.i = case_when(zri_2022 > zri_2006 + 0.5 ~ 1,
                           zri_2022 <= zri_2006 + 0.5 ~ 0),
         zri.d = case_when(zri_2022 < zri_2006 - 0.5 ~ 1,
                           zri_2022 >= zri_2006 - 0.5 ~ 0),
         zri.s = case_when(abs(zri_2022 - zri_2006) <= 0.5 ~ 1,
                           abs(zri_2022 - zri_2006) > 0.5 ~ 0),
         zri.id = abs(zri.i - zri.d),
         sindex1.change = sindex1_2022 - sindex1_2006,
         sindex2.change = sindex2_2022 - sindex2_2006,
         sindex3.change = sindex3_2022 - sindex3_2006,
         sindex4.change = sindex4_2022 - sindex4_2006,
         sindex5.change = sindex5_2022 - sindex5_2006,
         sindex6.change = sindex6_2022 - sindex6_2006,
         sindex7.change = sindex7_2022 - sindex7_2006,
         sindex1.i = case_when(sindex1_2022 > sindex1_2006 ~ 1,
                               sindex1_2022 <= sindex1_2006 ~ 0),
         sindex2.i = case_when(sindex2_2022 > sindex2_2006 ~ 1,
                               sindex2_2022 <= sindex2_2006 ~ 0),
         sindex3.i = case_when(sindex3_2022 > sindex3_2006 ~ 1,
                               sindex3_2022 <= sindex3_2006 ~ 0),
         sindex4.i = case_when(sindex4_2022 > sindex4_2006 ~ 1,
                               sindex4_2022 <= sindex4_2006 ~ 0),
         sindex5.i = case_when(sindex5_2022 > sindex5_2006 ~ 1,
                               sindex5_2022 <= sindex5_2006 ~ 0),
         sindex6.i = case_when(sindex6_2022 > sindex6_2006 ~ 1,
                               sindex6_2022 <= sindex6_2006 ~ 0),
         sindex7.i = case_when(sindex7_2022 > sindex7_2006 ~ 1,
                               sindex7_2022 <= sindex7_2006 ~ 0),
         sindex1.d = case_when(sindex1_2022 < sindex1_2006 ~ 1,
                               sindex1_2022 >= sindex1_2006 ~ 0),
         sindex2.d = case_when(sindex2_2022 < sindex2_2006 ~ 1,
                               sindex2_2022 >= sindex2_2006 ~ 0),
         sindex3.d = case_when(sindex3_2022 < sindex3_2006 ~ 1,
                               sindex3_2022 >= sindex3_2006 ~ 0),
         sindex4.d = case_when(sindex4_2022 < sindex4_2006 ~ 1,
                               sindex4_2022 >= sindex4_2006 ~ 0),
         sindex5.d = case_when(sindex5_2022 < sindex5_2006 ~ 1,
                               sindex5_2022 >= sindex5_2006 ~ 0),
         sindex6.d = case_when(sindex6_2022 < sindex6_2006 ~ 1,
                               sindex6_2022 >= sindex6_2006 ~ 0),
         sindex7.d = case_when(sindex7_2022 < sindex7_2006 ~ 1,
                               sindex7_2022 >= sindex7_2006 ~ 0),
         sindex1.s = case_when(sindex1_2022 == sindex1_2006 ~ 1,
                               sindex1_2022 != sindex1_2006 ~ 0),
         sindex2.s = case_when(sindex2_2022 == sindex2_2006 ~ 1,
                               sindex2_2022 != sindex2_2006 ~ 0),
         sindex3.s = case_when(sindex3_2022 == sindex3_2006 ~ 1,
                               sindex3_2022 != sindex3_2006 ~ 0),
         sindex4.s = case_when(sindex4_2022 == sindex4_2006 ~ 1,
                               sindex4_2022 != sindex4_2006 ~ 0),
         sindex5.s = case_when(sindex5_2022 == sindex5_2006 ~ 1,
                               sindex5_2022 != sindex5_2006 ~ 0),
         sindex6.s = case_when(sindex6_2022 == sindex6_2006 ~ 1,
                               sindex6_2022 != sindex6_2006 ~ 0),
         sindex7.s = case_when(sindex7_2022 == sindex7_2006 ~ 1,
                               sindex7_2022 != sindex7_2006 ~ 0))

summary(af.muni.final$zri.i)
summary(af.muni.final$zri.d)
summary(af.muni.final$zri.s)
summary(af.muni.final$zri.id)

summary(af.muni.final$sindex1.change)
summary(af.muni.final$sindex2.change)
summary(af.muni.final$sindex3.change)
summary(af.muni.final$sindex4.change)
summary(af.muni.final$sindex5.change)
summary(af.muni.final$sindex6.change)
summary(af.muni.final$sindex7.change)

summary(af.muni.final$sindex1.i)
summary(af.muni.final$sindex1.d)
summary(af.muni.final$sindex1.s)

summary(af.muni.final$sindex2.i)
summary(af.muni.final$sindex2.d)
summary(af.muni.final$sindex2.s)

summary(af.muni.final$sindex3.i)
summary(af.muni.final$sindex3.d)
summary(af.muni.final$sindex3.s)

summary(af.muni.final$sindex4.i)
summary(af.muni.final$sindex4.d)
summary(af.muni.final$sindex4.s)

summary(af.muni.final$sindex5.i)
summary(af.muni.final$sindex5.d)
summary(af.muni.final$sindex5.s)

summary(af.muni.final$sindex6.i)
summary(af.muni.final$sindex6.d)
summary(af.muni.final$sindex6.s)

summary(af.muni.final$sindex7.i)
summary(af.muni.final$sindex7.d)
summary(af.muni.final$sindex7.s)


## append original data ##
nzlu.panel.og <- af.muni.final %>%
  filter(.imp == 0) %>%
  mutate(zri_2006 = NA,
         zri.i = NA,
         zri.d = NA,
         zri.s = NA,
         zri.id = NA,
         sindex1.change = NA,
         sindex2.change = NA,
         sindex3.change = NA,
         sindex4.change = NA,
         sindex5.change = NA,
         sindex6.change = NA,
         sindex7.change = NA,
         sindex1.i = NA,
         sindex1.d = NA,
         sindex1.s = NA,
         sindex2.i = NA,
         sindex2.d = NA,
         sindex2.s = NA,
         sindex3.i = NA,
         sindex3.d = NA,
         sindex3.s = NA,
         sindex4.i = NA,
         sindex4.d = NA,
         sindex4.s = NA,
         sindex5.i = NA,
         sindex5.d = NA,
         sindex5.s = NA,
         sindex6.i = NA,
         sindex6.d = NA,
         sindex6.s = NA,
         sindex7.i = NA,
         sindex7.d = NA,
         sindex7.s = NA)

nzlu.panel.cb <- rbind(nzlu.panel.og,
                       af.muni.final)

nzlu.panel.mi2 <- as.mids(af.muni.final)

################################
## aggregate to MSA file here ##
################################

ptm <- tract.to.pl.metro %>%
  mutate(GEOID_muni = paste0(`FIPS State Code`,
                             placefp)) %>%
  select(GEOID_muni,
         placefp,
         placenm,
         cbsa10,
         `CBSA Title`) %>%
  unique() %>%
  rename(GEOID = GEOID_muni,
         cbsaname10 = `CBSA Title`)

ctm <- tract.to.cs.metro %>%
  mutate(GEOID_muni = paste0(`FIPS State Code`,
                             cousubfp),
         GEOID_muni_full = paste0(`FIPS State Code`,
                                  `FIPS County Code`,
                                  cousubfp)) %>%
  filter(GEOID_muni %notin% ptm$GEOID) %>%
  select(GEOID_muni,
         GEOID_muni_full,
         cousubfp,
         cousubnm,
         cbsa10,
         `CBSA Title`) %>%
  unique() %>%
  rename(GEOID = GEOID_muni,
         GEOID_full = GEOID_muni_full,
         cbsaname10 = `CBSA Title`)

## merge checks ## 
nrow(af.muni.final) == length(unique(af.muni.final$GEOID))
class(af.muni.final$GEOID)
range(nchar(trim(af.muni.final$GEOID)))

nrow(ptm) == length(unique(ptm$GEOID))
class(ptm$GEOID)
range(nchar(trim(ptm$GEOID)))

## merge data frames ## 
af.muni.msa.m1 <- stata.merge(af.muni.final,
                              ptm,
                              "GEOID")

## check merge ## 
table(af.muni.msa.m1$merge.variable)

## output non-matches eligible for match with county subs ##
af.muni.msa.nm1 <- af.muni.msa.m1 %>%
  filter(merge.variable ==1) %>%
  select(-placefp,
         -placenm,
         -cbsa10,
         -cbsaname10,
         -merge.variable)

## keep matches ## 
muni.msa.keep1 <- af.muni.msa.m1 %>%
  filter(merge.variable ==3) %>%
  select(-placefp,
         -placenm,
         -merge.variable) %>%
  mutate(GEOID_full = GEOID)

## check for dups ## 
## these are places that span multiple MSAs ##
muni.msa.dupcheck1 <- muni.msa.keep1 %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

nrow(muni.msa.dupcheck1)/11

## now, county subs ## 

## merge checks ##
nrow(ctm) == length(unique(ctm$GEOID))
class(ctm$GEOID)
range(nchar(trim(ctm$GEOID)))

nrow(af.muni.msa.nm1) == length(unique(af.muni.msa.nm1$GEOID))
class(af.muni.msa.nm1$GEOID)
range(nchar(trim(af.muni.msa.nm1$GEOID)))

## merge data frames ## 
muni.msa.cs.merge <- stata.merge(af.muni.msa.nm1,
                                 ctm,
                                 "GEOID")

## check merge ##
table(muni.msa.cs.merge$merge.variable)

## keep matches ## 
muni.msa.keep2 <- muni.msa.cs.merge %>%
  filter(merge.variable ==3) %>%
  select(-cousubfp,
         -cousubnm,
         -merge.variable)

## check for dups ## 
## these are places that span multiple MSAs ##
muni.msa.dupcheck2 <- muni.msa.keep2 %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

nrow(muni.msa.dupcheck2)/11

## append the two matched dataframes ##

nzlu.wmsas <- rbind(muni.msa.keep1,
                    muni.msa.keep2) %>%
  rename(CBSA = cbsa10)

nzlu.wmsas.fin.dupcheck <- nzlu.wmsas %>%
  group_by(GEOID,CBSA) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

nrow(nzlu.wmsas.fin.dupcheck)/11

## create MSA level file ## 

nzlu.msa.pt1 <- nzlu.wmsas %>%
  group_by(CBSA,.imp) %>%
  summarize(responses = n(),
            sindex1_2006 = median(sindex1_2006,na.rm=T),
            sindex2_2006 = median(sindex2_2006,na.rm=T),
            sindex3_2006 = median(sindex3_2006,na.rm=T),
            sindex4_2006 = median(sindex4_2006,na.rm=T),
            sindex5_2006 = median(sindex5_2006,na.rm=T),
            sindex6_2006 = median(sindex6_2006,na.rm=T),
            sindex7_2006 = median(sindex7_2006,na.rm=T),
            sindex1_2022 = median(sindex1_2022,na.rm=T),
            sindex2_2022 = median(sindex2_2022,na.rm=T),
            sindex3_2022 = median(sindex3_2022,na.rm=T),
            sindex4_2022 = median(sindex4_2022,na.rm=T),
            sindex5_2022 = median(sindex5_2022,na.rm=T),
            sindex6_2022 = median(sindex6_2022,na.rm=T),
            sindex7_2022 = median(sindex7_2022,na.rm=T),
            zri_2006_median = median(zri_2006, na.rm=T),
            zri_2006_range = abs(max(zri_2006, na.rm=T) - min(zri_2006,na.rm=T)),
            zri_2022_median = median(zri_2022, na.rm=T),
            zri_2022_range = abs(max(zri_2022, na.rm=T) - min(zri_2022,na.rm=T))) %>%
  ungroup() %>%
  group_by(.imp) %>%
  mutate(zri_2006_median_st = (zri_2006_median - mean(zri_2006_median, na.rm=T))/sd(zri_2006_median,na.rm=T),
         zri_2006_range_st = (zri_2006_range - mean(zri_2006_range, na.rm=T))/sd(zri_2006_range,na.rm=T),
         zri_2022_median_st = (zri_2022_median - mean(zri_2022_median, na.rm=T))/sd(zri_2022_median,na.rm=T),
         zri_2022_range_st = (zri_2022_range - mean(zri_2022_range, na.rm=T))/sd(zri_2022_range,na.rm=T)) %>%
  ungroup()

summary(nzlu.msa.pt1)

nzlu.msa.pt1look <- nzlu.msa.pt1 %>%
  select(CBSA,
         .imp,
         zri_2006_median,
         zri_2006_median_st,
         zri_2006_range,
         zri_2006_range_st,
         zri_2022_median,
         zri_2022_median_st,
         zri_2022_range,
         zri_2022_range_st)

nzlu.msa.final <- nzlu.msa.pt1 %>%
  inner_join(msas.keep, "CBSA")

nrow(nzlu.msa.final)
summary(nzlu.msa.final) 

muni.nrows <- length(unique(af.muni.final$GEOID))
msa.nrows <- length(unique(nzlu.msa.final$CBSA))

## check ##

nzlu.msa.finlook <- nzlu.msa.final %>%
  select(CBSA,
         .imp,
         zri_2006_median_st,
         zri_2022_median_st)

## bring on covariates and outcomes ##

af.msa.fin.cov <- af.msa.fin %>%
  select(-starts_with("zri"),
         -starts_with("sub_"),
         -starts_with("cc_"),
         -treat_2006,
         -treat_2022)

af.msa.use.m <- stata.merge(nzlu.msa.final,
                            af.msa.fin.cov,
                            "CBSA")

## check merge ##
table(af.msa.use.m$merge.variable)

## keep matches ##
af.msa.use <- af.msa.use.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  group_by(.imp) %>%
  mutate(treat_2006 = ifelse(zri_2006_median_st > mean(zri_2006_median_st, na.rm=T), 1,0),
         treat_2022 = ifelse(zri_2022_median_st > mean(zri_2022_median_st, na.rm=T), 1,0)) %>%
  ungroup()

summary(af.msa.use$treat_2006)
summary(af.msa.use$treat_2022)

table(af.msa.use$treat_2006, useNA = "ifany")
table(af.msa.use$treat_2022, useNA = "ifany")

## summary stats ## 

summary(af.msa.use$sindex1_2006)
sd(af.msa.use$sindex1_2006, na.rm=T)
summary(af.msa.use$sindex2_2006)
sd(af.msa.use$sindex2_2006, na.rm=T)
summary(af.msa.use$sindex3_2006)
sd(af.msa.use$sindex3_2006, na.rm=T)
summary(af.msa.use$sindex4_2006)
sd(af.msa.use$sindex4_2006, na.rm=T)
summary(af.msa.use$sindex5_2006)
sd(af.msa.use$sindex5_2006, na.rm=T)
summary(af.msa.use$sindex6_2006)
sd(af.msa.use$sindex6_2006, na.rm=T)
summary(af.msa.use$sindex7_2006)
sd(af.msa.use$sindex7_2006, na.rm=T)

summary(af.msa.use$sindex1_2022)
sd(af.msa.use$sindex1_2022,na.rm=T)
summary(af.msa.use$sindex2_2022)
sd(af.msa.use$sindex2_2022,na.rm=T)
summary(af.msa.use$sindex3_2022)
sd(af.msa.use$sindex3_2022,na.rm=T)
summary(af.msa.use$sindex4_2022)
sd(af.msa.use$sindex4_2022,na.rm=T)
summary(af.msa.use$sindex5_2022)
sd(af.msa.use$sindex5_2022,na.rm=T)
summary(af.msa.use$sindex6_2022)
sd(af.msa.use$sindex6_2022,na.rm=T)
summary(af.msa.use$sindex7_2022)
sd(af.msa.use$sindex7_2022,na.rm=T)

#######################
## now no imputation ## 
#######################

## merge checks ## 
nrow(af.muni.fin) == length(unique(af.muni.fin$GEOID))
class(af.muni.fin$GEOID)
range(nchar(trim(af.muni.fin$GEOID)))

## merge data frames ## 
af.muni.msa.merge1.noimp <- stata.merge(af.muni.fin,
                                        ptm,
                                        "GEOID")

## check merge ## 
table(af.muni.msa.merge1.noimp$merge.variable)

## output non-matches eligible for match with county subs ##
no.muni.msa.noimp <- af.muni.msa.merge1.noimp %>%
  filter(merge.variable == 1) %>%
  select(-placefp,
         -placenm,
         -cbsa10,
         -cbsaname10,
         -merge.variable)

## keep matches ## 
muni.msa.keep1.noimp <- af.muni.msa.merge1.noimp %>%
  filter(merge.variable ==3) %>%
  select(-placefp,
         -placenm,
         -merge.variable) %>%
  mutate(GEOID_full = GEOID)

## check for dups ## 
## these are places that span multiple MSAs ##
muni.msa.dupcheck1.noimp <- muni.msa.keep1.noimp %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## now, county subs ## 

## merge data frames ## 
muni.msa.cs.merge1.noimp <- stata.merge(no.muni.msa.noimp,
                                        ctm,
                                        "GEOID")
## keep matches ## 
muni.msa.keep2.noimp <- muni.msa.cs.merge1.noimp %>%
  filter(merge.variable ==3) %>%
  select(-cousubfp,
         -cousubnm,
         -merge.variable)

## check for dups ## 
## these are places that span multiple MSAs ##
muni.msa.dupcheck2.noimp <- muni.msa.keep2.noimp %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## append the two matched dataframes ##

nzlu.wmsas.noimp <- rbind(muni.msa.keep1.noimp,
                          muni.msa.keep2.noimp) %>%
  rename(CBSA = cbsa10)

nzlu.wmsas.noimp.findupcheck <- nzlu.wmsas.noimp %>%
  group_by(GEOID, CBSA) %>%
  summarize(n = n(), .groups = "drop") %>%
  filter(n>1)

## create MSA level file ## 

nzlu.msa.pt1.noimp <- nzlu.wmsas.noimp %>%
  group_by(CBSA) %>%
  summarize(responses = n(),
            sindex1_2006 = median(sindex1_2006,na.rm=T),
            sindex2_2006 = median(sindex2_2006,na.rm=T),
            sindex3_2006 = median(sindex3_2006,na.rm=T),
            sindex4_2006 = median(sindex4_2006,na.rm=T),
            sindex5_2006 = median(sindex5_2006,na.rm=T),
            sindex6_2006 = median(sindex6_2006,na.rm=T),
            sindex7_2006 = median(sindex7_2006,na.rm=T),
            sindex1_2022 = median(sindex1_2022,na.rm=T),
            sindex2_2022 = median(sindex2_2022,na.rm=T),
            sindex3_2022 = median(sindex3_2022,na.rm=T),
            sindex4_2022 = median(sindex4_2022,na.rm=T),
            sindex5_2022 = median(sindex5_2022,na.rm=T),
            sindex6_2022 = median(sindex6_2022,na.rm=T),
            sindex7_2022 = median(sindex7_2022,na.rm=T),
            zri_2006_median = median(zri_2006, na.rm=T),
            zri_2006_range = abs(max(zri_2006, na.rm=T) - min(zri_2006,na.rm=T)),
            zri_2022_median = median(zri_2022, na.rm=T),
            zri_2022_range = abs(max(zri_2022, na.rm=T) - min(zri_2022,na.rm=T))) %>%
  ungroup() %>%
  mutate(zri_2006_median_st = (zri_2006_median - mean(zri_2006_median, na.rm=T))/sd(zri_2006_median,na.rm=T),
         zri_2006_range_st = (zri_2006_range - mean(zri_2006_range, na.rm=T))/sd(zri_2006_range,na.rm=T),
         zri_2022_median_st = (zri_2022_median - mean(zri_2022_median, na.rm=T))/sd(zri_2022_median,na.rm=T),
         zri_2022_range_st = (zri_2022_range - mean(zri_2022_range, na.rm=T))/sd(zri_2022_range,na.rm=T))


nzlu.msa.final.noimp <- nzlu.msa.pt1.noimp %>%
  inner_join(msas.keep, "CBSA")

## summary stats ##

summary(nzlu.msa.final.noimp$sindex1_2006)
sd(nzlu.msa.final.noimp$sindex1_2006,na.rm=T)
summary(nzlu.msa.final.noimp$sindex2_2006)
sd(nzlu.msa.final.noimp$sindex2_2006,na.rm=T)
summary(nzlu.msa.final.noimp$sindex3_2006)
sd(nzlu.msa.final.noimp$sindex3_2006,na.rm=T)
summary(nzlu.msa.final.noimp$sindex4_2006)
sd(nzlu.msa.final.noimp$sindex4_2006,na.rm=T)
summary(nzlu.msa.final.noimp$sindex5_2006)
sd(nzlu.msa.final.noimp$sindex5_2006,na.rm=T)
summary(nzlu.msa.final.noimp$sindex6_2006)
sd(nzlu.msa.final.noimp$sindex6_2006,na.rm=T)
summary(nzlu.msa.final.noimp$sindex7_2006)
sd(nzlu.msa.final.noimp$sindex7_2006,na.rm=T)

summary(nzlu.msa.final.noimp$sindex1_2022)
sd(nzlu.msa.final.noimp$sindex1_2022, na.rm=T)
summary(nzlu.msa.final.noimp$sindex2_2022)
sd(nzlu.msa.final.noimp$sindex2_2022, na.rm=T)
summary(nzlu.msa.final.noimp$sindex3_2022)
sd(nzlu.msa.final.noimp$sindex3_2022, na.rm=T)
summary(nzlu.msa.final.noimp$sindex4_2022)
sd(nzlu.msa.final.noimp$sindex4_2022, na.rm=T)
summary(nzlu.msa.final.noimp$sindex5_2022)
sd(nzlu.msa.final.noimp$sindex5_2022, na.rm=T)
summary(nzlu.msa.final.noimp$sindex6_2022)
sd(nzlu.msa.final.noimp$sindex6_2022, na.rm=T)
summary(nzlu.msa.final.noimp$sindex7_2022)
sd(nzlu.msa.final.noimp$sindex7_2022, na.rm=T)


nzlu.msa.final.noimp.use <- nzlu.msa.final.noimp %>%
  filter(!is.na(zri_2006_median_st))

## create MSA no imp data and check ## 

af.msa.fix <- af.msa.fin 

af.msa.fix[af.msa.fix == -Inf] <- NA
af.msa.fix[af.msa.fix == Inf] <- NA


af.msa.noimp.use <- af.msa.fix %>%
  mutate(treat_2006 = ifelse(zri_2006_median_st > mean(zri_2006_median_st,na.rm=T), 1,0),
         treat_2022 = ifelse(zri_2022_median_st > mean(zri_2022_median_st,na.rm=T), 1,0)) %>%
  filter(!is.na(treat_2006))

look1 <- nzlu.msa.final.noimp.use %>%
  select(CBSA,
         zri_2006_median_st,
         zri_2022_median_st)

look2 <- af.msa.noimp.use %>%
  select(CBSA,
         zri_2006_median_st,
         zri_2022_median_st)

test.full <- look1 %>%
  inner_join(look2, "CBSA")

## create data for fixed effects regs ##

af.msa.fedatanoimp.t1 <- af.msa.fin %>%
  select(CBSA,
         zri_2006_range,
         zri_2006_median,
         ends_with("_2009"),
         munis_tot) %>%
  mutate(tp = "t1") %>%
  rename(zri_range = zri_2006_range,
         zri_median = zri_2006_median) %>%
  rename_with(~str_remove(., '_2009')) 


af.msa.fedatanoimp.t2 <- af.msa.fin %>%
  select(-treat_2022,
         -zri_full_2022,
         -cc_zri_2022) %>%
  select(CBSA,
         zri_2022_range,
         zri_2022_median,
         ends_with("_2022"),
         munis_tot) %>%
  mutate(tp = "t2") %>%
  rename(zri_range = zri_2022_range,
         zri_median = zri_2022_median) %>%
  rename_with(~str_remove(., '_2022')) 

af.msa.fedatanoimp.in <- rbind(af.msa.fedatanoimp.t1,
                               af.msa.fedatanoimp.t2) %>%
  mutate(zri_miss_flag = ifelse(is.na(zri_median) | 
                                is.na(zri_range), 1, 0))

zri.miss.fe <- af.msa.fedatanoimp.in %>%
  filter(zri_miss_flag == 1)

af.msa.fedatanoimp <- af.msa.fedatanoimp.in %>%
  filter(CBSA %notin% zri.miss.fe$CBSA)

nrow(af.msa.fedatanoimp.in)
nrow(af.msa.fedatanoimp)

####################
## pooled results ## 
####################

## create data for fixed effects regs ##

af.msa.fedata.t1 <- af.msa.use %>%
  select(CBSA,
         .imp,
         zri_2006_median,
         zri_2006_median_st,
         ends_with("_2009"),
         munis_tot) %>%
  mutate(tp = "t1") %>%
  rename(zri_median = zri_2006_median,
         zri_median_st = zri_2006_median_st) %>%
  rename_with(~str_remove(., '_2009')) 

af.msa.fedata.t2 <- af.msa.use %>%
  select(-treat_2022) %>%
  select(CBSA,
         .imp,
         zri_2022_median,
         zri_2022_median_st,
         ends_with("_2022"),
         munis_tot) %>%
  mutate(tp = "t2") %>%
  select(-starts_with("sindex")) %>%
  rename(zri_median = zri_2022_median,
         zri_median_st = zri_2022_median_st) %>%
  rename_with(~str_remove(., '_2022')) 


af.msa.fedata <- rbind(af.msa.fedata.t1,
                       af.msa.fedata.t2)

msa.nrows.fe <- length(unique(nzlu.msa.final$CBSA))*2

#####################
## begin bootstrap ##
#####################

source(paste0(pr_path,
              "msm_med.R"))

source(paste0(pr_path,
              "msm_noimp_med.R"))

## lists for storing output ##

msa.res1 <- list()
msa.res2 <- list()
msa.res3 <- list()
msa.res4 <- list()
msa.res5 <- list()
msa.res6 <- list()
msa.res7 <- list()
msa.res8 <- list()
msa.res9 <- list()
msa.res10 <- list()
msa.res11 <- list()
msa.res12 <- list()

msa.res.noimp1 <- list()
msa.res.noimp2 <- list()
msa.res.noimp3 <- list()
msa.res.noimp4 <- list()
msa.res.noimp5 <- list()
msa.res.noimp6 <- list()
msa.res.noimp7 <- list()
msa.res.noimp8 <- list()
msa.res.noimp9 <- list()
msa.res.noimp10 <- list()
msa.res.noimp11 <- list()
msa.res.noimp12 <- list()

reps <- 100

for(i in 1:reps){
  
  ## msa level ##
  
  boot.msas <- data.frame(CBSA=sample(af.msa.use$CBSA,
                                      length(unique(af.msa.use$CBSA)),
                                      replace=T))
  
  boot.msa.data <- boot.msas %>%
    left_join(af.msa.use, "CBSA", relationship = "many-to-many") %>%
    arrange(.imp) %>%
    mutate(.id = rep(seq(1,msa.nrows, by=1),11)) %>%
    select(.imp,
           .id,
           CBSA,
           everything())
  
  boot.msa.noimp.data <- boot.msas %>%
    left_join(af.msa.noimp.use, "CBSA", relationship = "many-to-many") %>%
    filter(!is.na(treat_2006) & !is.na(treat_2022))
  
  ## now, do the same for fixed effects data ## 
  
  boot.msa.fedata <- boot.msas %>%
    left_join(af.msa.fedata, "CBSA", relationship = "many-to-many") %>%
    arrange(.imp) %>%
    mutate(.id = rep(seq(1,msa.nrows.fe, by=1),11)) %>%
    select(.imp,
           .id,
           CBSA,
           everything())
  
  boot.msa.noimp.fedata <- boot.msas %>%
    left_join(af.msa.fedatanoimp, "CBSA", relationship = "many-to-many") %>%
    filter(!is.na(zri_median) & 
           !is.na(zri_range))
  
  print("OBS IMP")
  print(nrow(boot.msa.data))
  print(nrow(boot.msa.fedata))
  print("OBS NO IMP")
  print(nrow(boot.msa.noimp.data))
  print(nrow(boot.msa.noimp.fedata))
  
  af.msa.mids <- as.mids(boot.msa.data)
  af.msa.fe.mids <- as.mids(boot.msa.fedata)
  
  ###########################
  ## MSA level regressions ##
  ###########################
  
  msa.res1[[i]] <- msm_med("mgr_npov_2009","mgr_npov_2022","mgr_npov", 1)
  msa.res2[[i]] <- msm_med("mgr_hpov_2009","mgr_hpov_2022","mgr_hpov", 2)
  msa.res3[[i]] <- msm_med("mgr_hpov_cc_2009","mgr_hpov_cc_2022", "mgr_hpov_cc", 3)
  msa.res4[[i]] <- msm_med("rb_npov_2009","rb_npov_2022","rb_npov", 4)
  msa.res5[[i]] <- msm_med("rb_hpov_2009","rb_hpov_2022","rb_hpov", 5)
  msa.res6[[i]] <- msm_med("rb_hpov_cc_2009","rb_hpov_cc_2022","rb_hpov_cc", 6)
  msa.res7[[i]] <- msm_med("mpv_npov_2009","mpv_npov_2022","mpv_npov", 7)
  msa.res8[[i]] <- msm_med("mpv_hpov_2009","mpv_hpov_2022","mpv_hpov", 8)
  msa.res9[[i]] <- msm_med("mpv_hpov_cc_2009","mpv_hpov_cc_2022","mpv_hpov_cc", 9)
  msa.res10[[i]] <- msm_med("eli_ushare_2009","eli_ushare_2022","eli_ushare", 10)
  msa.res11[[i]] <- msm_med("vli_ushare_2009","vli_ushare_2022", "vli_ushare", 11)
  msa.res12[[i]] <- msm_med("li_ushare_2009","li_ushare_2022","li_ushare", 12)
  
  msa.res.noimp1[[i]] <- msm_noimp_med("mgr_npov_2009","mgr_npov_2022","mgr_npov", 1)
  msa.res.noimp2[[i]] <- msm_noimp_med("mgr_hpov_2009","mgr_hpov_2022", "mgr_hpov",2)
  msa.res.noimp3[[i]] <- msm_noimp_med("mgr_hpov_cc_2009","mgr_hpov_cc_2022", "mgr_hpov_cc", 3)
  msa.res.noimp4[[i]] <- msm_noimp_med("rb_npov_2009","rb_npov_2022","rb_npov", 4)
  msa.res.noimp5[[i]] <- msm_noimp_med("rb_hpov_2009","rb_hpov_2022","rb_hpov", 5)
  msa.res.noimp6[[i]] <- msm_noimp_med("rb_hpov_cc_2009","rb_hpov_cc_2022","rb_hpov_cc", 6)
  msa.res.noimp7[[i]] <- msm_noimp_med("mpv_npov_2009","mpv_npov_2022","mpv_npov", 7)
  msa.res.noimp8[[i]] <- msm_noimp_med("mpv_hpov_2009","mpv_hpov_2022","mpv_hpov", 8)
  msa.res.noimp9[[i]] <- msm_noimp_med("mpv_hpov_cc_2009","mpv_hpov_cc_2022","mpv_hpov_cc", 9)
  msa.res.noimp10[[i]] <- msm_noimp_med("eli_ushare_2009","eli_ushare_2022","eli_ushare", 10)
  msa.res.noimp11[[i]] <- msm_noimp_med("vli_ushare_2009","vli_ushare_2022","vli_ushare", 11)
  msa.res.noimp12[[i]] <- msm_noimp_med("li_ushare_2009","li_ushare_2022","li_ushare", 12)
  
  
}

## save the results ## 

save(msa.res1,
     file = "202c_m1_msa_med.Rda")

save(msa.res2,
     file = "202c_m2_msa_med.Rda")

save(msa.res3,
     file = "202c_m3_msa_med.Rda")

save(msa.res4,
     file = "202c_m4_msa_med.Rda")

save(msa.res5,
     file = "202c_m5_msa_med.Rda")

save(msa.res6,
     file = "202c_m6_msa_med.Rda")

save(msa.res7,
     file = "202c_m7_msa_med.Rda")

save(msa.res8,
     file = "202c_m8_msa_med.Rda")

save(msa.res9,
     file = "202c_m9_msa_med.Rda")

save(msa.res10,
     file = "202c_m10_msa_med.Rda")

save(msa.res11,
     file = "202c_m11_msa_med.Rda")

save(msa.res12,
     file = "202c_m12_msa_med.Rda")


save(msa.res.noimp1,
     file = "202c_m1_msa_noimp_med.Rda")

save(msa.res.noimp2,
     file = "202c_m2_msa_noimp_med.Rda")

save(msa.res.noimp3,
     file = "202c_m3_msa_noimp_med.Rda")

save(msa.res.noimp4,
     file = "202c_m4_msa_noimp_med.Rda")

save(msa.res.noimp5,
     file = "202c_m5_msa_noimp_med.Rda")

save(msa.res.noimp6,
     file = "202c_m6_msa_noimp_med.Rda")

save(msa.res.noimp7,
     file = "202c_m7_msa_noimp_med.Rda")

save(msa.res.noimp8,
     file = "202c_m8_msa_noimp_med.Rda")

save(msa.res.noimp9,
     file = "202c_m9_msa_noimp_med.Rda")

save(msa.res.noimp10,
     file = "202c_m10_msa_noimp_med.Rda")

save(msa.res.noimp11,
     file = "202c_m11_msa_noimp_med.Rda")

save(msa.res.noimp12,
     file = "202c_m12_msa_noimp_med.Rda")



### END OF PROGRAM ###

#sink()